Comparison

In this notebook we compare all the approaches we developed and test them on a small number of professions for which we have precise statistics. We will use both FastText's and Gonen's embeddings.

In [1]:
# general
import pandas as pd
import numpy as np
import scipy.stats
import codecs
from matplotlib import pyplot as plt
from numpy import linalg as LA

# bolukbasi's
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding

# mine
import db_functions as db

Step 1: Load embeddings

We load all the embeddings using the same methods and applying the same preprocessing in order to have a common starting point.

In [2]:
# Load Gonen's embeddings through the WordEmbedding loader imported from debiaswe.we.
# NOTE(review): the second argument ('g') presumably selects the file format /
# preprocessing for this source -- confirm in debiaswe/we.py.
E_g =  WordEmbedding('../gonen/embeddings/it_lemma_to_fem', 'g')
*** Reading data from ../gonen/embeddings/it_lemma_to_fem
(160597, 300)
160597 words of dimension 300 : </s>, ,, di, ., ..., title="henrik, pass">, title="nils, title="derrick
160597 words of dimension 300 : </s>, ,, di, ., ..., title="henrik, pass">, title="nils, title="derrick
In [3]:
# Load FastText's embeddings through the same WordEmbedding loader.
# NOTE(review): the second argument ('ft') presumably selects the file format /
# preprocessing for this source -- confirm in debiaswe/we.py.
E_ft =  WordEmbedding('../bolukbasi/mio/embeddings/cc.it.300.vec', 'ft')
*** Reading data from ../bolukbasi/mio/embeddings/cc.it.300.vec
(2000000, 300)
2000000 words of dimension 300 : ,, di, ., </s>, ..., Kelyn, golfclub, metallo-organici, ricercaLatin
2000000 words of dimension 300 : ,, di, ., </s>, ..., Kelyn, golfclub, metallo-organici, ricercaLatin

Step 2: working with FastText's embeddings

Step 2.1: vector difference

In [4]:
# Gender direction from a single word pair: 'lui' (he) vs 'lei' (she).
# NOTE(review): E_ft.diff presumably returns the difference v('lui') - v('lei'),
# possibly normalised -- confirm in debiaswe/we.py.
g_diff = E_ft.diff('lui', 'lei')

Step 2.2: PCA

In [5]:
# Definitional word pairs used to estimate the gender direction.
# Every pair is written as [masculine, feminine]; an earlier revision of this
# cell listed the same pairs in the opposite (feminine, masculine) order.
gender_pairs = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
In [6]:
# Run the PCA helper from debiaswe.we on the definitional pairs.
pca = we.doPCA(gender_pairs, E_ft)

# Keep the principal components and their singular values for the cells below.
pc, sv = pca.components_, pca.singular_values_
In [7]:
# Show the singular values under a heading (one print call, newline-separated).
print("Singular values:", sv, sep="\n")
Singular values:
[9.5309240e-01 5.3566736e-01 3.7772956e-01 3.4538972e-01 2.9901835e-01
 2.4023318e-01 5.8411594e-08 4.6098762e-08 4.3273410e-08 4.1712632e-08]
In [8]:
# Bar chart of the singular values, one bar per principal component.
component_ids = range(pca.n_components_)
plt.bar(component_ids, sv)
plt.title("Singular values")
plt.show()
In [9]:
# Gender direction from the first principal component, rescaled to unit length.
g_pca_0 = pc[0] / LA.norm(pc[0])

# Gender direction from the first two components: average them with their
# singular values as weights, then rescale the result to unit length.
pc_01_weighted = (sv[0]*pc[0] + sv[1]*pc[1]) / (sv[0] + sv[1])
g_pca_01 = pc_01_weighted / LA.norm(pc_01_weighted)

Step 2.3: pseudo-LDA

In [10]:
# Semantic-gender word pairs, each written as [masculine, feminine]
# (an earlier revision of this cell used the opposite order).
S_word = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]

# Grammatical-gender word pairs, read from file by the project helper.
# NOTE(review): the 'mf' mode presumably yields (masculine, feminine) rows --
# confirm in db_functions.read.
G_word = db.read('gram_def_mf.txt', 'mf')
In [11]:
# Look up the embedding of every word in the semantic (S) and grammatical (G)
# pair lists and stack them into matrices with one column per pair.
#
# S_word is ordered [masculine, feminine], so index 0 is the masculine word.
# The previous version read index 1 into S_m_v and index 0 into S_f_v, which
# left the two names swapped with respect to their contents (and with respect
# to the G_* lists below). Delta_S is built from (S_m - S_f)(S_m - S_f)^T, so
# the sign cancels there and its value is unaffected by this fix.
S_m_v = [E_ft.v(pair[0]) for pair in S_word]
S_f_v = [E_ft.v(pair[1]) for pair in S_word]

# NOTE(review): G_word rows are assumed to be (masculine, feminine) as well,
# matching the 'mf' mode they were read with -- confirm in db_functions.read.
G_m_v = [E_ft.v(pair[0]) for pair in G_word]
G_f_v = [E_ft.v(pair[1]) for pair in G_word]

# Transpose so that each column holds one word vector.
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T
In [12]:
# Semantic scatter matrix: D_S @ D_S.T with D_S = S_m - S_f (one column per
# semantic pair). The sign of the difference cancels in the product.
Delta_S = np.dot(np.subtract(S_m, S_f), np.subtract(S_m, S_f).T)

# Grammatical scatter matrix, built the same way from the grammatical pairs.
Delta_G = np.dot(np.subtract(G_m, G_f), np.subtract(G_m, G_f).T)

# A = Delta_G^{-1} @ Delta_S; its leading eigenvector is used as the
# "pseudo-LDA" gender direction in the next cell.
# NOTE(review): Delta_G has rank at most the number of pairs in G_word. If
# that is smaller than the embedding dimension the matrix is singular and
# np.linalg.inv is unreliable -- verify, and consider a pseudo-inverse or
# regularisation.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)
In [13]:
# Eigen-decomposition of A: w holds the eigenvalues, the columns of v the
# corresponding eigenvectors.
# NOTE(review): A is a product of two matrices and is not symmetric in
# general, so np.linalg.eig may return complex eigenvalues/eigenvectors; the
# results tables take np.real of the projections on g_e, which would silently
# drop any imaginary part -- confirm this is intended.
w, v = np.linalg.eig(A)

# gender direction: pick the eigenvector associated to the largest eigenvalue
g_e = v[:,np.argmax(w)] # np.linalg.eig returns unit-length eigenvectors

Step 2.4: compute projections for ungendered professions

In [14]:
# Read the ungendered professions with their ground-truth statistics.
# NOTE(review): the layout returned by the 'truth' mode is defined in
# db_functions.read -- confirm it matches what db.prof_proj(..., 'istat') expects.
prof_ung_truth = db.read('professions_ung.csv', 'truth')
In [15]:
# Project every ungendered profession on each of the four gender directions
# (g_diff, g_pca_0, g_pca_01, g_e), one db.prof_proj call per direction.
proj_truth_g_diff, proj_truth_g_pca_0, proj_truth_g_pca_01, proj_truth_g_e = (
    db.prof_proj(E_ft, prof_ung_truth, direction, 'istat')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [16]:
# Gather the per-profession results into parallel column lists; the
# truth_col* names are kept because the plotting cells read them.
n_prof = len(proj_truth_g_diff)
truth_col0 = [proj_truth_g_diff[k][0] for k in range(n_prof)]        # profession
truth_col1 = [proj_truth_g_diff[k][1] for k in range(n_prof)]        # projection on g_diff
truth_col2 = [proj_truth_g_pca_0[k][1] for k in range(n_prof)]       # projection on g_pca_0
truth_col3 = [proj_truth_g_pca_01[k][1] for k in range(n_prof)]      # projection on g_pca_01
truth_col4 = [np.real(proj_truth_g_e[k][1]) for k in range(n_prof)]  # real part of projection on g_e
truth_col5 = [proj_truth_g_diff[k][2] for k in range(n_prof)]        # % male
truth_col6 = [proj_truth_g_diff[k][3] for k in range(n_prof)]        # % female

# Column headers of the results table, in display order.
# NOTE(review): 'Prog g_e' looks like a typo for 'Proj g_e'; it is kept
# unchanged because the table is written to CSV under this header.
truth_headers = ['Profession', 'Proj g_diff', 'Proj g_pca_0', 'Proj g_pca_01',
                 'Prog g_e', '% male', '% female']
truth_columns = [truth_col0, truth_col1, truth_col2, truth_col3,
                 truth_col4, truth_col5, truth_col6]
truth_data = dict(zip(truth_headers, truth_columns))
truth_table = pd.DataFrame(truth_data)
truth_table
Out[16]:
Profession Proj g_diff Proj g_pca_0 Proj g_pca_01 Prog g_e % male % female
0 camionista 0.027856 0.102896 0.050494 0.065189 96.9 3.1
1 elettricista 0.073085 0.122825 0.115196 0.019939 96.8 3.2
2 ingegnere 0.043748 0.217653 0.167188 0.026228 83.6 16.4
3 architetto 0.076696 0.183021 0.165500 0.059626 64.7 35.3
4 notaio 0.082010 0.171843 0.112616 0.022859 66.4 33.6
5 commercialista 0.024475 0.113249 0.061505 0.053153 68.2 31.8
6 agrotecnico 0.004359 0.110057 0.080172 -0.008657 85.5 14.5
7 giornalista -0.048235 0.050688 -0.002423 -0.006167 54.7 45.3
8 veterinario 0.028022 0.161850 0.089791 -0.024446 40.8 59.2
9 insegnante -0.103818 -0.055340 -0.070229 0.020056 17.3 82.7
10 barista 0.027839 0.052001 0.019284 0.062231 38.9 61.1
11 medico -0.004724 0.135863 0.039559 -0.040868 56.3 43.7
12 ostetrica -0.231592 -0.285271 -0.312216 0.026011 1.7 98.3
13 dentista 0.004349 0.097455 0.028230 0.023513 64.8 35.2
14 badante -0.126959 -0.221358 -0.217654 0.001727 12.2 87.8
15 farmacista -0.054389 0.054562 -0.022261 -0.017615 30.8 69.2
16 dietista -0.169253 -0.125306 -0.203371 -0.007898 23.7 76.3
17 igienista -0.035202 -0.005823 -0.005887 0.002305 26.3 73.7
18 preside -0.081926 0.021247 -0.041888 0.014855 33.8 66.2
In [17]:
# Save the ungendered-professions results table as CSV, without the row index.
truth_table.to_csv('truth_ung_ft.csv', index=False)
In [18]:
# Pearson correlation between the share of women (% female) and the
# projection on each gender direction; only the coefficient r is kept.
r_g_diff, r_g_pca_0, r_g_pca_01, r_g_e = (
    scipy.stats.pearsonr(truth_col6, projections)[0]
    for projections in (truth_col1, truth_col2, truth_col3, truth_col4)
)
In [19]:
# One-row table with the Pearson coefficient of every gender direction.
pearson_data = dict(g_diff=r_g_diff, g_pca_0=r_g_pca_0, g_pca_01=r_g_pca_01, g_e=r_g_e)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[19]:
g_diff g_pca_0 g_pca_01 g_e
r -0.805963 -0.78981 -0.804142 -0.286917
In [20]:
# Correlation plot for g_diff: projection of each ungendered profession
# against its share of women, with the least-squares regression line.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_diff")
ax.legend(facecolor='white')

# Write each profession name 10 points above its marker, centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y in zip(truth_col0, truth_col6, truth_col1):
    ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_g_diff')
In [21]:
# Correlation plot for g_pca_0: projection of each ungendered profession
# against its share of women, with the least-squares regression line.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_0")
ax.legend(facecolor='white')

# Write each profession name 10 points above its marker, centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y in zip(truth_col0, truth_col6, truth_col2):
    ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_g_pca_0')
In [22]:
# Correlation plot for g_pca_01: projection of each ungendered profession
# against its share of women, with the least-squares regression line.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_01")
ax.legend(facecolor='white')

# Write each profession name 10 points above its marker, centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y in zip(truth_col0, truth_col6, truth_col3):
    ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_g_pca_01')
In [23]:
# Correlation plot for g_e: projection of each ungendered profession
# against its share of women, with the least-squares regression line.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_e")
ax.legend(facecolor='white')

# Write each profession name 10 points above its marker, centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y in zip(truth_col0, truth_col6, truth_col4):
    ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_g_e')
In [24]:
# 2x2 grid of scatter plots: projection of every ungendered profession on
# each gender direction, plotted against its share of women.
#
# The figure is bound to `fig` here. Previously plt.figure(...) was called
# without keeping the result, so the final fig.savefig(...) wrote whatever
# figure an earlier cell had left in `fig` instead of this grid.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(18, 18))

# (axes, projections, title, marker colour) for each panel, in reading order.
panels = [
    (ax1, truth_col1, 'g_diff', 'blue'),
    (ax2, truth_col2, 'g_pca_0', 'red'),
    (ax3, truth_col3, 'g_pca_01', 'green'),
    (ax4, truth_col4, 'g_e', 'magenta'),
]

# Profession names are written 10 points above their marker, centred.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')

for panel_ax, projections, panel_title, colour in panels:
    panel_ax.scatter(truth_col6, projections, c=colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('% female')
    panel_ax.set_ylabel('proj value')
    # Annotate on the panel's own axes rather than on pyplot's "current" axes.
    for name, x, y in zip(truth_col0, truth_col6, projections):
        panel_ax.annotate(name, (x, y), **label_style)

plt.show()
fig.savefig('plot/ung-prof/scatter/ft_all')
In [25]:
# Overlay the g_diff and g_pca_0 projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.set(title='g_diff vs g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col2):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_diff_vs_g_pca_0')
In [26]:
# Overlay the g_diff and g_pca_01 projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_diff vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col3):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_diff_vs_g_pca_01')
In [27]:
# Overlay the g_diff and g_e projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_diff vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col4):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_diff_vs_g_e')
In [28]:
# Overlay the g_pca_0 and g_pca_01 projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_pca_0 vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col2, truth_col3):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_pca_0_vs_g_pca_01')
In [29]:
# Overlay the g_pca_0 and g_e projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_0 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col2, truth_col4):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_pca_0_vs_g_e')
In [30]:
# Overlay the g_pca_01 and g_e projections of the ungendered professions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_01 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Label the point of each series with the profession name.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
for name, x, y_first, y_second in zip(truth_col0, truth_col6, truth_col3, truth_col4):
    ax.annotate(name, (x, y_first), **label_style)
    ax.annotate(name, (x, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_g_pca_01_vs_g_e')

Step 2.5: compute projections for gendered professions

In [31]:
# Re-import db_functions so that edits made to the module after the first
# import are picked up without restarting the kernel.
# NOTE(review): this is a development convenience. On a fresh
# Restart & Run All it only re-executes the module, and its output echoes the
# module's absolute local path. Consider moving the import to the top cell
# and dropping the reload (or using %autoreload) before sharing.
import importlib
importlib.reload(db)
Out[31]:
<module 'db_functions' from '/Users/davidebiasion/Documents/tesi/prova dati/comparativa/db_functions.py'>
In [32]:
# Read the gendered professions (masculine/feminine forms) with their
# ground-truth statistics.
# NOTE(review): the layout returned by the 'truth-mf' mode is defined in
# db_functions.read -- confirm it matches what db.prof_proj(..., 'istat-mf') expects.
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')
In [33]:
# Project every gendered profession pair on each of the four gender
# directions (g_diff, g_pca_0, g_pca_01, g_e), one db.prof_proj call each.
mf_proj_truth_g_diff, mf_proj_truth_g_pca_0, mf_proj_truth_g_pca_01, mf_proj_truth_g_e = (
    db.prof_proj(E_ft, prof_gen_truth, direction, 'istat-mf')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [34]:
# Gather the per-pair results into parallel column lists; the mf_truth_col*
# names are kept because the following cells read them.
n_pairs = len(mf_proj_truth_g_diff)
mf_truth_col0 = [mf_proj_truth_g_diff[k][0] for k in range(n_pairs)]         # male profession
mf_truth_col1 = [mf_proj_truth_g_diff[k][1] for k in range(n_pairs)]         # male projection on g_diff
mf_truth_col2 = [mf_proj_truth_g_pca_0[k][1] for k in range(n_pairs)]        # male projection on g_pca_0
mf_truth_col3 = [mf_proj_truth_g_pca_01[k][1] for k in range(n_pairs)]       # male projection on g_pca_01
mf_truth_col4 = [np.real(mf_proj_truth_g_e[k][1]) for k in range(n_pairs)]   # male projection on g_e (real part)
mf_truth_col5 = [mf_proj_truth_g_diff[k][2] for k in range(n_pairs)]         # female profession
mf_truth_col6 = [mf_proj_truth_g_diff[k][3] for k in range(n_pairs)]         # female projection on g_diff
mf_truth_col7 = [mf_proj_truth_g_pca_0[k][3] for k in range(n_pairs)]        # female projection on g_pca_0
mf_truth_col8 = [mf_proj_truth_g_pca_01[k][3] for k in range(n_pairs)]       # female projection on g_pca_01
mf_truth_col9 = [np.real(mf_proj_truth_g_e[k][3]) for k in range(n_pairs)]   # female projection on g_e (real part)
mf_truth_col10 = [mf_proj_truth_g_diff[k][4] for k in range(n_pairs)]        # % female

# Column headers of the results table, in display order.
# NOTE(review): 'M-Prog g_e' / 'F-Prog g_e' look like typos for '...-Proj g_e';
# they are kept unchanged because the table is written to CSV under these headers.
mf_truth_headers = ['Male profession', 'M-Proj g_diff', 'M-Proj g_pca_0', 'M-Proj g_pca_01',
                    'M-Prog g_e', 'Female profession', 'F-Proj g_diff', 'F-Proj g_pca_0',
                    'F-Proj g_pca_01', 'F-Prog g_e', '% female']
mf_truth_columns = [mf_truth_col0, mf_truth_col1, mf_truth_col2, mf_truth_col3,
                    mf_truth_col4, mf_truth_col5, mf_truth_col6, mf_truth_col7,
                    mf_truth_col8, mf_truth_col9, mf_truth_col10]
mf_truth_data = dict(zip(mf_truth_headers, mf_truth_columns))

mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
Out[34]:
Male profession M-Proj g_diff M-Proj g_pca_0 M-Proj g_pca_01 M-Prog g_e Female profession F-Proj g_diff F-Proj g_pca_0 F-Proj g_pca_01 F-Prog g_e % female
0 calzolaio 0.106336 0.155676 0.117852 0.022520 calzolaia -0.083221 -0.217826 -0.183558 -0.007811 8.0
1 biologo 0.066985 0.229635 0.178475 -0.050910 biologa -0.263955 -0.272445 -0.288617 0.001746 76.0
2 avvocato -0.053588 0.132720 0.056975 -0.010401 avvocatessa -0.291315 -0.283173 -0.312245 0.060756 47.2
3 psicologo 0.068424 0.169177 0.117352 -0.015112 psicologa -0.238524 -0.304119 -0.323389 0.005394 82.9
4 maestro 0.201372 0.312891 0.275900 0.023119 maestra -0.244963 -0.301025 -0.313171 0.037129 96.4
5 professore -0.029616 0.228402 0.129135 0.007852 professoressa -0.323749 -0.292575 -0.333926 0.014332 71.9
6 cameriere 0.046219 0.097974 0.054632 0.046446 cameriera -0.189663 -0.259899 -0.277246 0.015340 51.4
7 albergatore 0.081593 0.130638 0.134854 -0.004077 albergatrice -0.200101 -0.274839 -0.248046 0.043339 50.6
8 infermiere -0.044242 0.007298 -0.023620 0.044576 infermiera -0.253701 -0.257126 -0.294478 0.042178 77.0
9 geologo 0.035964 0.187185 0.125979 -0.042255 geologa -0.277250 -0.278603 -0.292457 0.024115 39.8
10 biologo 0.066985 0.229635 0.178475 -0.050910 biologa -0.263955 -0.272445 -0.288617 0.001746 71.4
11 zoologo 0.045325 0.177953 0.143061 -0.044046 zoologa -0.236281 -0.258107 -0.251821 0.010222 70.5
12 filosofo 0.154506 0.256563 0.246152 -0.077562 filosofa -0.214466 -0.272772 -0.264087 -0.056505 51.9
In [35]:
# Save the gendered-professions results table as CSV, without the row index.
mf_truth_table.to_csv('truth_gen_ft.csv', index=False)
In [36]:
# Mid-point between the male and female projection of each profession pair,
# computed separately for every gender direction.
mf_mean_g_diff = np.add(np.array(mf_truth_col1), np.array(mf_truth_col6)) / 2
mf_mean_g_pca_0 = np.add(np.array(mf_truth_col2), np.array(mf_truth_col7)) / 2
mf_mean_g_pca_01 = np.add(np.array(mf_truth_col3), np.array(mf_truth_col8)) / 2
mf_mean_g_e = np.add(np.array(mf_truth_col4), np.array(mf_truth_col9)) / 2
In [37]:
# Pearson correlation between the share of women (% female) and the mean
# male/female projection on each gender direction; only r is kept.
r_mean_g_diff, r_mean_g_pca_0, r_mean_g_pca_01, r_mean_g_e = (
    scipy.stats.pearsonr(mf_truth_col10, mean_projections)[0]
    for mean_projections in (mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e)
)
In [38]:
# One-row table with the Pearson coefficient of the mean projections.
pearson_data = dict(g_diff=r_mean_g_diff, g_pca_0=r_mean_g_pca_0,
                    g_pca_01=r_mean_g_pca_01, g_e=r_mean_g_e)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[38]:
g_diff g_pca_0 g_pca_01 g_e
r -0.274013 0.082882 -0.10993 0.066384
In [39]:
# g_diff: ungendered professions (squares plus their regression line)
# together with the male and female forms of the gendered professions.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every point with its word: (names, x values, y values) per series.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
labelled_series = (
    (truth_col0, truth_col6, truth_col1),           # ungendered professions
    (mf_truth_col0, mf_truth_col10, mf_truth_col1), # male forms
    (mf_truth_col5, mf_truth_col10, mf_truth_col6), # female forms
)
for names, xs, ys in labelled_series:
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_diff')
In [40]:
# g_diff: as the previous plot, plus the mean male/female projection of each
# gendered pair and the regression line fitted on those means.
pct_female = np.array(truth_col6)
pct_female_gen = np.array(mf_truth_col10)

slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_diff)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.plot(pct_female_gen, intercept_mean + slope_mean * pct_female_gen, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_diff, c='brown', label='mean proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every point with its word: (names, x values, y values) per series.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
labelled_series = (
    (truth_col0, truth_col6, truth_col1),           # ungendered professions
    (mf_truth_col0, mf_truth_col10, mf_truth_col1), # male forms
    (mf_truth_col5, mf_truth_col10, mf_truth_col6), # female forms
)
for names, xs, ys in labelled_series:
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, (x, y), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_diff_mean')
In [41]:
# g_pca_0: ungendered professions (squares plus their regression line)
# together with the male and female forms of the gendered professions.
pct_female = np.array(truth_col6)
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every point with its word: (names, x values, y values) per series.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
labelled_series = (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2), # male forms
    (mf_truth_col5, mf_truth_col10, mf_truth_col7), # female forms
    (truth_col0, truth_col6, truth_col2),           # ungendered professions
)
for names, xs, ys in labelled_series:
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, (x, y), **label_style)

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_pca_0')
In [42]:
# g_pca_0: as the previous plot, plus the mean male/female projection of each
# gendered pair and the regression line fitted on those means.
pct_female = np.array(truth_col6)
pct_female_gen = np.array(mf_truth_col10)

slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_pca_0)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.plot(pct_female_gen, intercept_mean + slope_mean * pct_female_gen, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_0, c='brown', label='mean proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every point with its word: (names, x values, y values) per series.
label_style = dict(textcoords="offset points", xytext=(0, 10), ha='center')
labelled_series = (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2), # male forms
    (mf_truth_col5, mf_truth_col10, mf_truth_col7), # female forms
    (truth_col0, truth_col6, truth_col2),           # ungendered professions
)
for names, xs, ys in labelled_series:
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, (x, y), **label_style)

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_pca_0_mean')
In [43]:
# g_pca_01: ungendered projections with their regression line, plus the male and
# female projections of the gendered professions, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Write every profession name 10 points above its marker, centred horizontally.
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
labelled_series = [
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),  # labels for the 'male proj' points
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),  # labels for the 'female proj' points
    (truth_col0, truth_col6, truth_col3),            # labels for the 'ung proj' points
]
for names, xs, ys in labelled_series:
    for label, x_pt, y_pt in zip(names, xs, ys):
        plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_pca_01')
In [45]:
# g_pca_01: same plot as the previous cell, with the 'mean proj' points of the
# gendered professions and a second regression line fitted through them.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_pca_01)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

pct_female_ung = np.array(truth_col6)
pct_female_gen = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.plot(pct_female_gen, intercept_mean + slope_mean * pct_female_gen, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_01, c='brown', label='mean proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Write every profession name 10 points above its marker, centred horizontally.
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
labelled_series = [
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),  # labels for the 'male proj' points
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),  # labels for the 'female proj' points
    (truth_col0, truth_col6, truth_col3),            # labels for the 'ung proj' points
]
for names, xs, ys in labelled_series:
    for label, x_pt, y_pt in zip(names, xs, ys):
        plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_pca_01_mean')
In [46]:
# g_e: ungendered projections with their regression line, plus the male and
# female projections of the gendered professions, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='ung proj')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Write every profession name 10 points above its marker, centred horizontally.
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
labelled_series = [
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),  # labels for the 'male proj' points
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),  # labels for the 'female proj' points
    (truth_col0, truth_col6, truth_col4),            # labels for the 'ung proj' points
]
for names, xs, ys in labelled_series:
    for label, x_pt, y_pt in zip(names, xs, ys):
        plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_e')
In [47]:
# g_e: same plot as the previous cell, with the 'mean proj' points of the
# gendered professions and a second regression line fitted through them.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_e)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

pct_female_ung = np.array(truth_col6)
pct_female_gen = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='ung proj')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.plot(pct_female_gen, intercept_mean + slope_mean * pct_female_gen, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_e, c='brown', label='mean proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Write every profession name 10 points above its marker, centred horizontally.
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
labelled_series = [
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),  # labels for the 'male proj' points
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),  # labels for the 'female proj' points
    (truth_col0, truth_col6, truth_col4),            # labels for the 'ung proj' points
]
for names, xs, ys in labelled_series:
    for label, x_pt, y_pt in zip(names, xs, ys):
        plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/gen-prof/scatter/ft_g_e_mean')

Step 2.6: subtracting grammatical gender from the embeddings

In [48]:
from debiaswe.debias import debias
In [49]:
# Run Bolukbasi's debias() on the FastText embeddings, passing the grammatical-gender
# pairs (G_word, defined in an earlier cell) as the third argument and empty lists
# for the second and fourth.
# NOTE(review): the return value is discarded and the following cells recompute every
# direction from E_ft, so debias() presumably modifies E_ft in place -- confirm in
# debiaswe/debias.py. Re-running this cell would then debias the embeddings again.
debias(E_ft, [], G_word, [])
2000000 words of dimension 300 : ,, di, ., </s>, ..., Kelyn, golfclub, metallo-organici, ricercaLatin
set()
2000000 words of dimension 300 : ,, di, ., </s>, ..., Kelyn, golfclub, metallo-organici, ricercaLatin

Step 2.7: repeating Steps 2.1-2.5

In [50]:
# gender direction from the single pair 'lui' (he) / 'lei' (she),
# recomputed here from E_ft after the debias() call above
g_diff = E_ft.diff('lui', 'lei')
In [51]:
# Pairs of words used to define the gender direction.
# Each pair is ordered [masculine word, feminine word]; the reverse ordering
# ([feminine, masculine]) was used at some point and is disabled.
gender_pairs = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
In [52]:
# Run the debiaswe PCA helper on the gender pairs, then keep the principal
# components (pc) and the matching singular values (sv) for the next cells.
pca = we.doPCA(gender_pairs, E_ft)
pc, sv = pca.components_, pca.singular_values_
In [53]:
# show the singular values under a one-line header
print("Singular values:", sv, sep="\n")
Singular values:
[6.9684738e-01 5.0248235e-01 3.8360339e-01 3.4403831e-01 3.0402446e-01
 2.4636908e-01 6.0329249e-08 4.8515240e-08 4.6014925e-08 3.9377852e-08]
In [54]:
# bar chart of the singular values, one bar per PCA component
component_index = range(pca.n_components_)
plt.bar(component_index, sv)
plt.title("Singular values")
plt.show()
In [55]:
# gender directions derived from the PCA
# g_pca_0: first principal component, scaled to unit length
g_pca_0 = pc[0] / LA.norm(pc[0])

# g_pca_01: average of the first two components weighted by their singular
# values, then scaled to unit length
pc01_weighted = (sv[0]*pc[0] + sv[1]*pc[1]) / (sv[0] + sv[1])
g_pca_01 = pc01_weighted / LA.norm(pc01_weighted)
In [56]:
# Semantic-gender pairs, each ordered [masculine word, feminine word]; the
# reverse ordering ([feminine, masculine]) was used at some point and is disabled.
S_word = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]

# Grammatical-gender pairs, loaded through db.read in 'mf' mode.
# NOTE(review): presumably one [masculine, feminine] pair per entry -- confirm
# against gram_def_mf.txt.
G_word = db.read('gram_def_mf.txt', 'mf')
In [57]:
# Embedding vectors for the two sides of every pair.
# S_word pairs are ordered [masculine, feminine] ('lui' comes before 'lei'), so
# index 0 belongs in the *_m lists and index 1 in the *_f lists.
# Delta_S is built from outer products of (S_m - S_f), so its value does not
# depend on the sign of that difference.
# NOTE(review): G_word is assumed to be ordered [masculine, feminine] as well --
# confirm against gram_def_mf.txt.
S_m_v = [E_ft.v(pair[0]) for pair in S_word]
S_f_v = [E_ft.v(pair[1]) for pair in S_word]
G_m_v = [E_ft.v(pair[0]) for pair in G_word]
G_f_v = [E_ft.v(pair[1]) for pair in G_word]

# Stack the vectors and transpose, giving one column per pair.
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T
In [58]:
# delta semantic: (S_m - S_f)(S_m - S_f)^T, built from the semantic-gender pairs
Delta_S = np.dot(np.subtract(S_m, S_f), np.subtract(S_m, S_f).T)

# delta grammatical: (G_m - G_f)(G_m - G_f)^T, built from the grammatical-gender pairs
Delta_G = np.dot(np.subtract(G_m, G_f), np.subtract(G_m, G_f).T)

# A = Delta_G^{-1} Delta_S; its eigenvectors are analysed in the next cell.
# NOTE(review): Delta_G is a product D D^T, so its rank cannot exceed the number of
# columns of D (presumably the number of pairs in G_word). If that is below the
# embedding dimension, Delta_G is singular and np.linalg.inv is unreliable -- confirm.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)
In [59]:
# eigenvalues (w) and eigenvectors (columns of v) of A
w, v = np.linalg.eig(A)

# gender direction: pick the eigenvector associated to the largest eigenvalue
# NOTE(review): A is not symmetric in general, so w and v may come back complex;
# the table cells below apply np.real() to the projections on g_e -- confirm that
# dropping the imaginary part is intended.
g_e = v[:,np.argmax(w)] # already normalized (np.linalg.eig returns unit-length eigenvectors)
In [60]:
# read the ungendered professions through db.read in 'truth' mode
# (the table below reports a % male and a % female figure for each of them)
prof_ung_truth = db.read('professions_ung.csv', 'truth')
In [61]:
# Project every ungendered profession on each of the four gender directions.
# The calls run in the order g_diff, g_pca_0, g_pca_01, g_e, all in 'istat' mode.
proj_truth_g_diff, proj_truth_g_pca_0, proj_truth_g_pca_01, proj_truth_g_e = (
    db.prof_proj(E_ft, prof_ung_truth, direction, 'istat')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [62]:
# print results
# Walk the four projection lists in lockstep and split them into one list per
# table column (the truth_colN names are reused by the plotting cells below).
ung_rows = list(zip(proj_truth_g_diff, proj_truth_g_pca_0,
                    proj_truth_g_pca_01, proj_truth_g_e))

truth_col0 = [on_diff[0] for on_diff, _, _, _ in ung_rows]            # profession
truth_col1 = [on_diff[1] for on_diff, _, _, _ in ung_rows]            # projection on g_diff
truth_col2 = [on_pca_0[1] for _, on_pca_0, _, _ in ung_rows]          # projection on g_pca_0
truth_col3 = [on_pca_01[1] for _, _, on_pca_01, _ in ung_rows]        # projection on g_pca_01
truth_col4 = [np.real(on_e[1]) for _, _, _, on_e in ung_rows]         # projection on g_e (real part)
truth_col5 = [on_diff[2] for on_diff, _, _, _ in ung_rows]            # %male
truth_col6 = [on_diff[3] for on_diff, _, _, _ in ung_rows]            # %female

# NOTE(review): 'Prog g_e' looks like a typo for 'Proj g_e'; it is kept unchanged
# because the key becomes a column header of the exported CSV.
truth_headers = ['Profession', 'Proj g_diff', 'Proj g_pca_0', 'Proj g_pca_01',
                 'Prog g_e', '% male', '% female']
truth_data = dict(zip(truth_headers,
                      [truth_col0, truth_col1, truth_col2, truth_col3,
                       truth_col4, truth_col5, truth_col6]))
truth_table = pd.DataFrame(truth_data)
truth_table
Out[62]:
Profession Proj g_diff Proj g_pca_0 Proj g_pca_01 Prog g_e % male % female
0 camionista 0.000958 0.010437 0.057523 -0.028762 96.9 3.1
1 elettricista 0.051165 0.075229 0.060888 0.006139 96.8 3.2
2 ingegnere 0.006015 0.132715 0.159754 0.003354 83.6 16.4
3 architetto 0.048271 0.128841 0.118354 0.017533 64.7 35.3
4 notaio 0.047991 0.074732 0.122838 0.140066 66.4 33.6
5 commercialista -0.001897 0.031570 0.078392 0.078729 68.2 31.8
6 agrotecnico -0.015271 0.068142 0.090167 -0.022012 85.5 14.5
7 giornalista -0.066798 -0.020748 0.039422 0.051914 54.7 45.3
8 veterinario -0.022770 0.001387 0.046067 -0.089617 40.8 59.2
9 insegnante -0.099758 -0.070729 -0.047352 0.048904 17.3 82.7
10 barista 0.017341 0.004999 0.040029 -0.046712 38.9 61.1
11 medico -0.053320 -0.038994 0.043721 -0.014985 56.3 43.7
12 ostetrica -0.188618 -0.242364 -0.155075 -0.007405 1.7 98.3
13 dentista -0.025306 -0.014621 0.047869 -0.029015 64.8 35.2
14 badante -0.091202 -0.175202 -0.135458 0.033739 12.2 87.8
15 farmacista -0.076731 -0.045913 0.036305 -0.019926 30.8 69.2
16 dietista -0.160470 -0.173801 -0.055916 0.062114 23.7 76.3
17 igienista -0.026632 0.013891 0.016453 -0.046595 26.3 73.7
18 preside -0.093920 -0.041043 0.037194 0.061447 33.8 66.2
In [63]:
# save the ungendered-professions table to CSV, without the index column
truth_table.to_csv('truth_ung_ft_deb.csv', index=False)
In [64]:
# pearson correlation between % female and the projection on each direction
# (pearsonr returns (r, p-value); only r is kept)
r_g_diff, r_g_pca_0, r_g_pca_01, r_g_e = (
    scipy.stats.pearsonr(truth_col6, projections)[0]
    for projections in (truth_col1, truth_col2, truth_col3, truth_col4)
)
In [65]:
# one-row table ('r') with the Pearson coefficient of every gender direction
pearson_data = dict(g_diff=r_g_diff, g_pca_0=r_g_pca_0, g_pca_01=r_g_pca_01, g_e=r_g_e)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[65]:
g_diff g_pca_0 g_pca_01 g_e
r -0.769887 -0.791266 -0.795674 0.017077
In [66]:
# correlation plot g_diff: projections vs % female with the least-squares line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='Data points')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_diff")
ax.legend(facecolor='white')

# profession name 10 points above each marker, centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_pt in zip(truth_col0, truth_col6, truth_col1):
    plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_deb_g_diff')
In [67]:
# correlation plot g_pca_0: projections vs % female with the least-squares line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='Data points')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_0")
ax.legend(facecolor='white')

# profession name 10 points above each marker, centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_pt in zip(truth_col0, truth_col6, truth_col2):
    plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_deb_g_pca_0')
In [68]:
# correlation plot g_pca_01: projections vs % female with the least-squares line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='Data points')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_01")
ax.legend(facecolor='white')

# profession name 10 points above each marker, centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_pt in zip(truth_col0, truth_col6, truth_col3):
    plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_deb_g_pca_01')
In [69]:
# correlation plot g_e: projections vs % female with the least-squares line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

pct_female_ung = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10,10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='Data points')
ax.plot(pct_female_ung, intercept + slope * pct_female_ung, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_e")
ax.legend(facecolor='white')

# profession name 10 points above each marker, centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_pt in zip(truth_col0, truth_col6, truth_col4):
    plt.annotate(label, (x_pt, y_pt), **label_style)
plt.show()
fig.savefig('plot/ung-prof/correlation/ft_deb_g_e')
In [70]:
# scatter plots: a 2x2 grid with one panel per gender direction
# (projection of the ungendered professions vs % female).

# Bind the new figure to `fig`: savefig() at the end of the cell must write this
# grid, not whatever figure an earlier cell left in `fig`.
fig = plt.figure(figsize=(18,18))

# (subplot position, projections, marker colour, panel title)
panel_specs = [
    (221, truth_col1, 'blue', 'g_diff'),
    (222, truth_col2, 'red', 'g_pca_0'),
    (223, truth_col3, 'green', 'g_pca_01'),
    (224, truth_col4, 'magenta', 'g_e'),
]

panel_axes = []
for position, projections, colour, panel_title in panel_specs:
    panel = fig.add_subplot(position)
    panel.scatter(truth_col6, projections, c=colour)
    panel.set_title(panel_title)
    panel.set_xlabel('% female')
    panel.set_ylabel('proj value')

    # Annotate on the panel itself so the labels cannot end up on another subplot.
    for label, x_pt, y_pt in zip(truth_col0, truth_col6, projections):
        panel.annotate(label, # this is the text
                       (x_pt, y_pt), # this is the point to label
                       textcoords="offset points", # how to position the text
                       xytext=(0,10), # distance from text to points (x,y)
                       ha='center') # horizontal alignment can be left, right or center
    panel_axes.append(panel)

# Keep the original per-panel names available to any other cell that uses them.
ax1, ax2, ax3, ax4 = panel_axes

plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_all')
In [71]:
# g_diff vs g_pca_0: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.set(title='g_diff vs g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col2):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_diff_vs_g_pca_0')
In [72]:
# g_diff vs g_pca_01: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_diff vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col3):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_diff_vs_g_pca_01')
In [73]:
# g_diff vs g_e: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_diff vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col1, truth_col4):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_diff_vs_g_e')
In [74]:
# g_pca_0 vs g_pca_01: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_pca_0 vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col2, truth_col3):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_pca_0_vs_g_pca_01')
In [75]:
# g_pca_0 vs g_e: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_0 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col2, truth_col4):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_pca_0_vs_g_e')
In [76]:
# g_pca_01 vs g_e: both projections of the ungendered professions on one plot
fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_01 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# each profession is labelled twice, once per plotted direction,
# 10 points above the marker and centred horizontally
label_style = dict(textcoords="offset points", xytext=(0,10), ha='center')
for label, x_pt, y_first, y_second in zip(truth_col0, truth_col6, truth_col3, truth_col4):
    plt.annotate(label, (x_pt, y_first), **label_style)
    plt.annotate(label, (x_pt, y_second), **label_style)
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_g_pca_01_vs_g_e')
In [77]:
# read the gendered professions (male form / female form) through db.read in
# 'truth-mf' mode; the table below reports a % female figure for each pair
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')
In [78]:
# Project every gendered profession pair on each of the four gender directions.
# The calls run in the order g_diff, g_pca_0, g_pca_01, g_e, all in 'istat-mf' mode.
mf_proj_truth_g_diff, mf_proj_truth_g_pca_0, mf_proj_truth_g_pca_01, mf_proj_truth_g_e = (
    db.prof_proj(E_ft, prof_gen_truth, direction, 'istat-mf')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [79]:
# print results
# Walk the four projection lists in lockstep and split them into one list per
# table column (the mf_truth_colN names are reused by the cells below).
mf_rows = list(zip(mf_proj_truth_g_diff, mf_proj_truth_g_pca_0,
                   mf_proj_truth_g_pca_01, mf_proj_truth_g_e))

mf_truth_col0 = [on_diff[0] for on_diff, _, _, _ in mf_rows]          # male profession
mf_truth_col1 = [on_diff[1] for on_diff, _, _, _ in mf_rows]          # male projection on g_diff
mf_truth_col2 = [on_pca_0[1] for _, on_pca_0, _, _ in mf_rows]        # male projection on g_pca_0
mf_truth_col3 = [on_pca_01[1] for _, _, on_pca_01, _ in mf_rows]      # male projection on g_pca_01
mf_truth_col4 = [np.real(on_e[1]) for _, _, _, on_e in mf_rows]       # male projection on g_e (real part)
mf_truth_col5 = [on_diff[2] for on_diff, _, _, _ in mf_rows]          # female profession
mf_truth_col6 = [on_diff[3] for on_diff, _, _, _ in mf_rows]          # female projection on g_diff
mf_truth_col7 = [on_pca_0[3] for _, on_pca_0, _, _ in mf_rows]        # female projection on g_pca_0
mf_truth_col8 = [on_pca_01[3] for _, _, on_pca_01, _ in mf_rows]      # female projection on g_pca_01
mf_truth_col9 = [np.real(on_e[3]) for _, _, _, on_e in mf_rows]       # female projection on g_e (real part)
mf_truth_col10 = [on_diff[4] for on_diff, _, _, _ in mf_rows]         # %female

# NOTE(review): 'M-Prog g_e' / 'F-Prog g_e' look like typos for '...-Proj g_e';
# they are kept unchanged because the keys are the table's column headers.
mf_truth_headers = ['Male profession', 'M-Proj g_diff', 'M-Proj g_pca_0',
                    'M-Proj g_pca_01', 'M-Prog g_e',
                    'Female profession', 'F-Proj g_diff', 'F-Proj g_pca_0',
                    'F-Proj g_pca_01', 'F-Prog g_e', '% female']
mf_truth_data = dict(zip(mf_truth_headers,
                         [mf_truth_col0, mf_truth_col1, mf_truth_col2, mf_truth_col3,
                          mf_truth_col4, mf_truth_col5, mf_truth_col6, mf_truth_col7,
                          mf_truth_col8, mf_truth_col9, mf_truth_col10]))

mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
Out[79]:
Male profession M-Proj g_diff M-Proj g_pca_0 M-Proj g_pca_01 M-Prog g_e Female profession F-Proj g_diff F-Proj g_pca_0 F-Proj g_pca_01 F-Prog g_e % female
0 calzolaio 0.079271 0.081505 0.105640 -0.049912 calzolaia -0.014804 -0.044504 -0.012927 -0.040669 8.0
1 biologo 0.015788 0.103029 0.119144 0.073356 biologa -0.223442 -0.215060 -0.141161 -0.045970 76.0
2 avvocato -0.086206 0.030805 0.101467 0.096192 avvocatessa -0.244240 -0.206941 -0.103990 0.016933 47.2
3 psicologo 0.025581 0.041972 0.066970 0.056083 psicologa -0.192896 -0.249648 -0.163175 0.034717 82.9
4 maestro 0.159218 0.235579 0.239014 0.040721 maestra -0.189827 -0.201996 -0.114366 0.040153 96.4
5 professore -0.084330 0.078977 0.159987 0.081700 professoressa -0.284365 -0.249782 -0.137540 0.030730 71.9
6 cameriere 0.007340 -0.034930 -0.014165 -0.006508 cameriera -0.143171 -0.192962 -0.110693 -0.009995 51.4
7 albergatore 0.057318 0.080634 0.044936 0.017233 albergatrice -0.144356 -0.151718 -0.122668 0.033562 50.6
8 infermiere -0.072265 -0.109098 -0.097028 -0.033959 infermiera -0.214588 -0.219005 -0.119956 -0.012385 77.0
9 geologo -0.000254 0.097662 0.148690 0.026921 geologa -0.233198 -0.201702 -0.121670 -0.004921 39.8
10 biologo 0.015788 0.103029 0.119144 0.073356 biologa -0.223442 -0.215060 -0.141161 -0.045970 71.4
11 zoologo 0.006750 0.088896 0.097710 -0.056328 zoologa -0.194228 -0.179436 -0.133293 -0.038391 70.5
12 filosofo 0.109790 0.168746 0.132203 0.041802 filosofa -0.163709 -0.172906 -0.122071 0.006861 51.9
In [80]:
# Save the male/female ("gen") results table as CSV, without the index column.
# The table to write here is `mf_truth_table`; `truth_table` is a different
# table (built from the truth_col* lists) and does not belong in this file.
mf_truth_table.to_csv('truth_gen_ft_deb.csv', index=False)
In [81]:
# compute mean male-female projection, one array per gender direction
mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e = (
    (np.array(male_proj) + np.array(female_proj)) / 2
    for male_proj, female_proj in (
        (mf_truth_col1, mf_truth_col6),  # g_diff
        (mf_truth_col2, mf_truth_col7),  # g_pca_0
        (mf_truth_col3, mf_truth_col8),  # g_pca_01
        (mf_truth_col4, mf_truth_col9),  # g_e
    )
)
In [82]:
# pearson correlation between '% female' and the mean male-female projection;
# pearsonr returns (r, p-value), only r is kept
r_mean_g_diff, r_mean_g_pca_0, r_mean_g_pca_01, r_mean_g_e = (
    scipy.stats.pearsonr(mf_truth_col10, mean_proj)[0]
    for mean_proj in (mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e)
)
In [83]:
# one-row summary table: Pearson r of the mean projection for each direction
pearson_data = dict(
    g_diff=r_mean_g_diff,
    g_pca_0=r_mean_g_pca_0,
    g_pca_01=r_mean_g_pca_01,
    g_e=r_mean_g_e,
)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[83]:
g_diff g_pca_0 g_pca_01 g_e
r -0.33404 -0.286937 -0.214452 0.367544
In [84]:
# scatter plot on g_diff: 'ung' professions with their fitted line, plus the
# male and female projections of the profession pairs
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
ung_x = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (truth_col0, truth_col6, truth_col1),
    (mf_truth_col0, mf_truth_col10, mf_truth_col1),
    (mf_truth_col5, mf_truth_col10, mf_truth_col6),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_diff')
In [85]:
# scatter plot on g_diff with two fitted lines: one through the 'ung'
# professions, one through the mean of each male/female pair
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_diff)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_diff, c='brown', label='mean proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (truth_col0, truth_col6, truth_col1),
    (mf_truth_col0, mf_truth_col10, mf_truth_col1),
    (mf_truth_col5, mf_truth_col10, mf_truth_col6),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_diff_mean')
In [86]:
# scatter plot on g_pca_0: 'ung' professions with their fitted line, plus the
# male and female projections of the profession pairs
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
ung_x = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)

ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2),
    (mf_truth_col5, mf_truth_col10, mf_truth_col7),
    (truth_col0, truth_col6, truth_col2),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_pca_0')
In [87]:
# scatter plot on g_pca_0 with two fitted lines: one through the 'ung'
# professions, one through the mean of each male/female pair
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_pca_0)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_0, c='brown', label='mean proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2),
    (mf_truth_col5, mf_truth_col10, mf_truth_col7),
    (truth_col0, truth_col6, truth_col2),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_pca_0_mean')
In [88]:
# scatter plot on g_pca_01: 'ung' professions with their fitted line, plus the
# male and female projections of the profession pairs
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
ung_x = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),
    (truth_col0, truth_col6, truth_col3),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_pca_01')
In [89]:
# scatter plot on g_pca_01 with two fitted lines: one through the 'ung'
# professions, one through the mean of each male/female pair
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_pca_01)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_01, c='brown', label='mean proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),
    (truth_col0, truth_col6, truth_col3),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_pca_01_mean')
In [90]:
# scatter plot on g_e: 'ung' professions with their fitted line, plus the
# male and female projections of the profession pairs
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
ung_x = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),
    (truth_col0, truth_col6, truth_col4),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_e')
In [91]:
# scatter plot on g_e with two fitted lines: one through the 'ung'
# professions, one through the mean of each male/female pair
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_e)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_e, c='brown', label='mean proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# write each profession name 10 points above its marker, centred horizontally
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),
    (truth_col0, truth_col6, truth_col4),
):
    for idx, name in enumerate(names):
        ax.annotate(name, (xs[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/ft_deb_g_e_mean')

Step 3: working with Gonen's embeddings

Step 3.1: repeating steps 2.1-2.5

In [92]:
# gender direction for Gonen's embeddings (E_g), obtained from the pair 'lui'/'lei'
# NOTE(review): this rebinds g_diff, which until here held the direction computed
# from the FastText embeddings (E_ft); every later cell sees the E_g version.
# NOTE(review): E_g.diff presumably returns the (normalised) vector difference of
# the two words -- confirm in debiaswe.we.
g_diff = E_g.diff('lui', 'lei')
In [93]:
# list of pairs to define gender; every pair is a two-element list of words,
# with 'lui'/'uomo'/'padre'/... in first position
gender_pairs = [
    list(pair)
    for pair in (
        ('lui', 'lei'),
        ('uomo', 'donna'),
        ('padre', 'madre'),
        ('marito', 'moglie'),
        ('fratello', 'sorella'),
        ('maschio', 'femmina'),
    )
]
In [94]:
# PCA over the gender pairs, computed on Gonen's embeddings
pca = we.doPCA(gender_pairs, E_g)

# keep the principal components (pc) and the singular values (sv) of the fit
pc, sv = pca.components_, pca.singular_values_
In [95]:
# heading and values on separate lines
print("Singular values:", sv, sep="\n")
Singular values:
[6.73165823e-01 4.74985087e-01 4.38029897e-01 3.35332203e-01
 2.90767211e-01 2.36732569e-01 1.27794811e-16 1.13799029e-16
 9.10948385e-17 8.73430101e-17]
In [96]:
# bar chart of the singular values, one bar per PCA component
component_positions = range(pca.n_components_)
plt.bar(component_positions, sv)
plt.title("Singular values")
plt.show()
In [97]:
# gender direction from the first principal component, scaled to unit norm
first_pc = pc[0]
g_pca_0  = first_pc/LA.norm(first_pc)

# gender direction from the first two components, averaged with their singular
# values as weights and then scaled to unit norm
weighted_pc = (sv[0]*pc[0]+sv[1]*pc[1])/(sv[0]+sv[1])
g_pca_01 = weighted_pc/LA.norm(weighted_pc)
In [98]:
# semantic gender pairs; every pair is a two-element list of words, with
# 'lui'/'uomo'/'padre'/... in first position
S_word = [
    list(pair)
    for pair in (
        ('lui', 'lei'),
        ('uomo', 'donna'),
        ('padre', 'madre'),
        ('marito', 'moglie'),
        ('fratello', 'sorella'),
        ('maschio', 'femmina'),
    )
]

# grammatical gender pairs, loaded from file through db.read in 'mf' mode
G_word = db.read('gram_def_mf.txt', 'mf')
In [99]:
# Collect the embedding vectors of the semantic (S) and grammatical (G) pairs.
#
# S_word lists the male form first (['lui','lei'], ['uomo','donna'], ...), so
# index 0 must feed the male list and index 1 the female list. Reading them the
# other way round leaves S_m holding the female vectors and S_f the male ones.
# Delta_S, built from (S_m - S_f)(S_m - S_f)^T, is unchanged by the ordering,
# but the names now match their content.
S_m_v = [E_g.v(pair[0]) for pair in S_word]
S_f_v = [E_g.v(pair[1]) for pair in S_word]

# G_word entries are read as (male, female)
# NOTE(review): assumes db.read(..., 'mf') returns male-first pairs -- confirm.
G_m_v = [E_g.v(pair[0]) for pair in G_word]
G_f_v = [E_g.v(pair[1]) for pair in G_word]

# transpose so that each word vector becomes one column
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T
In [100]:
# delta semantic: D_S . D_S^T, where D_S = S_m - S_f holds one pair difference
# per column (S_m / S_f are built with one word vector per column)
Delta_S = np.dot(np.subtract(S_m, S_f), np.subtract(S_m, S_f).T)

# delta grammatical: same construction on the grammatical pairs (G_m - G_f)
Delta_G = np.dot(np.subtract(G_m, G_f), np.subtract(G_m, G_f).T)

# A = Delta_G^{-1} . Delta_S
# NOTE(review): np.linalg.inv needs Delta_G to be non-singular; as a product
# D_G . D_G^T its rank cannot exceed the number of pairs in G_word -- confirm the
# list is long enough for the embedding dimension.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)
In [101]:
# eigenvalues (w) and eigenvectors (columns of v) of A
w, v = np.linalg.eig(A)

# gender direction: the eigenvector whose eigenvalue is the largest
largest_eig = np.argmax(w)
g_e = v[:, largest_eig] # already normalized
In [102]:
# read professions from 'professions_ung.csv' through db.read in 'truth' mode
# NOTE(review): presumably each entry carries the profession together with its
# % male / % female figures (later cells read those fields from the prof_proj
# results) -- confirm in db_functions.
prof_ung_truth = db.read('professions_ung.csv', 'truth')
In [103]:
# compute projection of each profession on the four gender directions, in the
# order g_diff, g_pca_0, g_pca_01, g_e (same db.prof_proj call for each one)
proj_truth_g_diff, proj_truth_g_pca_0, proj_truth_g_pca_01, proj_truth_g_e = (
    db.prof_proj(E_g, prof_ung_truth, direction, 'istat')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [104]:
# print results: one list per pandas column, read row by row from the
# projection results (rows are indexed over proj_truth_g_diff)
row_ids = range(len(proj_truth_g_diff))

truth_col0 = [proj_truth_g_diff[k][0] for k in row_ids] # profession
truth_col1 = [proj_truth_g_diff[k][1] for k in row_ids] # projection on g_diff
truth_col2 = [-proj_truth_g_pca_0[k][1] for k in row_ids] # projection on g_pca_0, sign flipped
truth_col3 = [-proj_truth_g_pca_01[k][1] for k in row_ids] # projection on g_pca_01, sign flipped
truth_col4 = [-np.real(proj_truth_g_e[k][1]) for k in row_ids] # real part of projection on g_e, sign flipped
truth_col5 = [proj_truth_g_diff[k][2] for k in row_ids] # %male
truth_col6 = [proj_truth_g_diff[k][3] for k in row_ids] # %female

# column label -> values, in display order
truth_data = dict([
    ('Profession', truth_col0),
    ('Proj g_diff', truth_col1),
    ('Proj g_pca_0', truth_col2),
    ('Proj g_pca_01', truth_col3),
    ('Prog g_e', truth_col4),
    ('% male', truth_col5),
    ('% female', truth_col6),
])
truth_table = pd.DataFrame(truth_data)
truth_table
Out[104]:
Profession Proj g_diff Proj g_pca_0 Proj g_pca_01 Prog g_e % male % female
0 camionista -0.116571 -0.065065 -0.090247 -0.021633 96.9 3.1
1 elettricista -0.032968 0.039751 0.006391 0.036579 96.8 3.2
2 ingegnere 0.048153 0.174863 0.120834 0.060551 83.6 16.4
3 architetto 0.094760 0.186434 0.117584 0.024187 64.7 35.3
4 notaio 0.045788 0.054650 -0.022539 -0.051675 66.4 33.6
5 commercialista -0.058101 0.046732 -0.014279 -0.005193 68.2 31.8
6 giornalista -0.111980 -0.057983 -0.094459 -0.029385 54.7 45.3
7 veterinario -0.083075 0.017559 -0.041211 0.051057 40.8 59.2
8 insegnante -0.157774 -0.082164 -0.162323 0.007874 17.3 82.7
9 barista -0.187549 -0.134068 -0.184296 -0.017124 38.9 61.1
10 medico 0.011663 0.018053 -0.062328 0.027769 56.3 43.7
11 ostetrica -0.099285 -0.115064 -0.140822 0.009814 1.7 98.3
12 dentista -0.163118 -0.062662 -0.095312 -0.009986 64.8 35.2
13 badante -0.258226 -0.271156 -0.292702 -0.002045 12.2 87.8
14 farmacista -0.058247 -0.036439 -0.094893 0.001469 30.8 69.2
15 igienista 0.042426 0.072872 0.026160 0.029747 26.3 73.7
16 preside -0.128567 -0.079019 -0.139150 -0.021458 33.8 66.2
In [105]:
# save the results table computed on Gonen's embeddings as CSV, leaving out the
# DataFrame index column
truth_table.to_csv('truth_ung_g.csv', index=False)
In [106]:
# pearson correlation between '% female' and the projection on each direction;
# pearsonr returns (r, p-value), only r is kept
r_g_diff, r_g_pca_0, r_g_pca_01, r_g_e = (
    scipy.stats.pearsonr(truth_col6, projections)[0]
    for projections in (truth_col1, truth_col2, truth_col3, truth_col4)
)
In [107]:
# one-row summary table: Pearson r for each gender direction
pearson_data = dict(
    g_diff=r_g_diff,
    g_pca_0=r_g_pca_0,
    g_pca_01=r_g_pca_01,
    g_e=r_g_e,
)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[107]:
g_diff g_pca_0 g_pca_01 g_e
r -0.408154 -0.552042 -0.595359 -0.050896
In [108]:
# correlation plot g_diff: projection against '% female' with the fitted line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
pct_female = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_diff")
ax.legend(facecolor='white')

# write each profession name 10 points above its marker, centred horizontally
for idx, name in enumerate(truth_col0):
    ax.annotate(name, (truth_col6[idx], truth_col1[idx]),
                textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/correlation/g_g_diff')
In [109]:
# correlation plot g_pca_0: projection against '% female' with the fitted line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
pct_female = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_0")
ax.legend(facecolor='white')

# write each profession name 10 points above its marker, centred horizontally
for idx, name in enumerate(truth_col0):
    ax.annotate(name, (truth_col6[idx], truth_col2[idx]),
                textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/correlation/g_g_pca_0')
In [110]:
# correlation plot g_pca_01: projection against '% female' with the fitted line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
pct_female = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_pca_01")
ax.legend(facecolor='white')

# write each profession name 10 points above its marker, centred horizontally
for idx, name in enumerate(truth_col0):
    ax.annotate(name, (truth_col6[idx], truth_col3[idx]),
                textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/correlation/g_g_pca_01')
In [111]:
# correlation plot g_e: projection against '% female' with the fitted line
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
pct_female = np.array(truth_col6)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta',label='Data points')
ax.plot(pct_female, intercept + slope * pct_female, label=line)
ax.set(xlabel='% female', ylabel='proj value', title="g_e")
ax.legend(facecolor='white')

# write each profession name 10 points above its marker, centred horizontally
for idx, name in enumerate(truth_col0):
    ax.annotate(name, (truth_col6[idx], truth_col4[idx]),
                textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/correlation/g_g_e')
In [112]:
# scatter plots: a 2x2 grid, one panel per gender direction, each showing the
# projection of every profession against '% female'

# Bind the new figure to `fig`: the savefig call at the end of the cell must
# write this grid. Without the assignment `fig` still refers to whatever figure
# an earlier cell created, and that older figure is what gets saved.
fig = plt.figure(figsize=(18,18))

# (subplot position, title, projections, marker colour) for each panel
panels = [
    (221, 'g_diff', truth_col1, 'blue'),
    (222, 'g_pca_0', truth_col2, 'red'),
    (223, 'g_pca_01', truth_col3, 'green'),
    (224, 'g_e', truth_col4, 'magenta'),
]

axes = []
for position, panel_title, projections, colour in panels:
    panel_ax = fig.add_subplot(position)
    panel_ax.scatter(truth_col6, projections, c=colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('% female')
    panel_ax.set_ylabel('proj value')

    # write each profession name 10 points above its marker, centred horizontally
    for i, label in enumerate(truth_col0):
        panel_ax.annotate(label,
                          (truth_col6[i], projections[i]),
                          textcoords="offset points",
                          xytext=(0,10),
                          ha='center')
    axes.append(panel_ax)

# expose each panel under its own name (ax1..ax4)
ax1, ax2, ax3, ax4 = axes

plt.show()
fig.savefig('plot/ung-prof/scatter/g_all')
In [113]:
# overlay of the projections on g_diff and on g_pca_0 against '% female'
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.set(title='g_diff vs g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# label both points of every profession, 10 points above each marker
for idx, name in enumerate(truth_col0):
    for ys in (truth_col1, truth_col2):
        ax.annotate(name, (truth_col6[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_diff_vs_g_pca_0')
In [114]:
# overlay of the projections on g_diff and on g_pca_01 against '% female'
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_diff vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# label both points of every profession, 10 points above each marker
for idx, name in enumerate(truth_col0):
    for ys in (truth_col1, truth_col3):
        ax.annotate(name, (truth_col6[idx], ys[idx]),
                    textcoords="offset points", xytext=(0,10), ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_diff_vs_g_pca_01')
In [115]:
# Overlay the g_diff and g_e projection values (y) on the '% female'
# axis (x, truth_col6) so the two directions can be compared point by point.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col1, c='blue', label='g_diff')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_diff vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Write the truth_col0 label 10 points above each marker of both series,
# centred horizontally on the marker.
for i, label in enumerate(truth_col0):
    for proj in (truth_col1, truth_col4):
        ax.annotate(label,
                    (truth_col6[i], proj[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_diff_vs_g_e')
In [116]:
# Overlay the g_pca_0 and g_pca_01 projection values (y) on the '% female'
# axis (x, truth_col6) so the two directions can be compared point by point.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.set(title='g_pca_0 vs g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Write the truth_col0 label 10 points above each marker of both series,
# centred horizontally on the marker.
for i, label in enumerate(truth_col0):
    for proj in (truth_col2, truth_col3):
        ax.annotate(label,
                    (truth_col6[i], proj[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_pca_0_vs_g_pca_01')
In [117]:
# Overlay the g_pca_0 and g_e projection values (y) on the '% female'
# axis (x, truth_col6) so the two directions can be compared point by point.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col2, c='red', label='g_pca_0')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_0 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Write the truth_col0 label 10 points above each marker of both series,
# centred horizontally on the marker.
for i, label in enumerate(truth_col0):
    for proj in (truth_col2, truth_col4):
        ax.annotate(label,
                    (truth_col6[i], proj[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_pca_0_vs_g_e')
In [118]:
# Overlay the g_pca_01 and g_e projection values (y) on the '% female'
# axis (x, truth_col6) so the two directions can be compared point by point.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(truth_col6, truth_col3, c='green', label='g_pca_01')
ax.scatter(truth_col6, truth_col4, c='magenta', label='g_e')
ax.set(title='g_pca_01 vs g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Write the truth_col0 label 10 points above each marker of both series,
# centred horizontally on the marker.
for i, label in enumerate(truth_col0):
    for proj in (truth_col3, truth_col4):
        ax.annotate(label,
                    (truth_col6[i], proj[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/ung-prof/scatter/g_g_pca_01_vs_g_e')
In [119]:
# Read the profession list stored in professions_gen.csv through the project
# helper db.read, selecting its 'truth-mf' mode (presumably the male/female
# profession pairs with their ground-truth statistics; verify in db_functions).
# NOTE(review): the following cell passes 'istat-mf' to db.prof_proj for this
# same data; confirm that the two mode strings are meant to differ.
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')
In [120]:
# Project every profession of prof_gen_truth (looked up in E_g) on each of the
# four gender directions, in the order g_diff, g_pca_0, g_pca_01, g_e.
# NOTE(review): the data was read with mode 'truth-mf' but is projected with
# mode 'istat-mf'; confirm against db.prof_proj that this is intended.
(mf_proj_truth_g_diff,
 mf_proj_truth_g_pca_0,
 mf_proj_truth_g_pca_01,
 mf_proj_truth_g_e) = (
    db.prof_proj(E_g, prof_gen_truth, direction, 'istat-mf')
    for direction in (g_diff, g_pca_0, g_pca_01, g_e)
)
In [121]:
# Gather the projection results into one table with a row per profession pair.
# The four mf_proj_truth_* lists are read in parallel; within each entry:
#   [0] male profession, [1] male projection,
#   [2] female profession, [3] female projection, [4] % female.
# NOTE(review): the male projections on g_pca_0, g_pca_01 and g_e are negated
# while the female ones are used as returned; confirm this asymmetry is
# intended, since the male/female means computed later depend on it.
rows = range(len(mf_proj_truth_g_diff))

# male side
mf_truth_col0 = [mf_proj_truth_g_diff[i][0] for i in rows]               # male profession
mf_truth_col1 = [mf_proj_truth_g_diff[i][1] for i in rows]               # on g_diff
mf_truth_col2 = [-mf_proj_truth_g_pca_0[i][1] for i in rows]             # on g_pca_0 (sign flipped)
mf_truth_col3 = [-mf_proj_truth_g_pca_01[i][1] for i in rows]            # on g_pca_01 (sign flipped)
mf_truth_col4 = [-np.real(mf_proj_truth_g_e[i][1]) for i in rows]        # on g_e (real part, sign flipped)

# female side
mf_truth_col5 = [mf_proj_truth_g_diff[i][2] for i in rows]               # female profession
mf_truth_col6 = [mf_proj_truth_g_diff[i][3] for i in rows]               # on g_diff
mf_truth_col7 = [mf_proj_truth_g_pca_0[i][3] for i in rows]              # on g_pca_0
mf_truth_col8 = [mf_proj_truth_g_pca_01[i][3] for i in rows]             # on g_pca_01
mf_truth_col9 = [np.real(mf_proj_truth_g_e[i][3]) for i in rows]         # on g_e (real part)

# ground truth
mf_truth_col10 = [mf_proj_truth_g_diff[i][4] for i in rows]              # % female

# Column labels are kept exactly as they were, including the 'Prog' spelling
# of the two g_e columns.
mf_truth_data = {
    'Male profession': mf_truth_col0,
    'M-Proj g_diff': mf_truth_col1,
    'M-Proj g_pca_0': mf_truth_col2,
    'M-Proj g_pca_01': mf_truth_col3,
    'M-Prog g_e': mf_truth_col4,
    'Female profession': mf_truth_col5,
    'F-Proj g_diff': mf_truth_col6,
    'F-Proj g_pca_0': mf_truth_col7,
    'F-Proj g_pca_01': mf_truth_col8,
    'F-Prog g_e': mf_truth_col9,
    '% female': mf_truth_col10,
}

mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
Out[121]:
Male profession M-Proj g_diff M-Proj g_pca_0 M-Proj g_pca_01 M-Prog g_e Female profession F-Proj g_diff F-Proj g_pca_0 F-Proj g_pca_01 F-Prog g_e % female
0 biologo 0.034183 0.093461 0.075028 0.044190 biologa -0.212560 0.265112 0.271275 0.001866 76.0
1 avvocato -0.037062 0.129062 0.058017 -0.027940 avvocatessa -0.254444 0.224026 0.208353 -0.001100 47.2
2 psicologo -0.044157 0.026535 -0.016280 -0.006756 psicologa -0.286778 0.343406 0.315514 0.049775 82.9
3 maestro 0.088839 0.118162 -0.005922 -0.060189 maestra -0.219276 0.240546 0.260523 0.048168 96.4
4 professore -0.032111 0.055725 -0.024257 0.059632 professoressa -0.270093 0.320515 0.307693 0.031977 71.9
5 cameriere -0.114928 -0.032919 -0.107858 -0.020555 cameriera -0.310950 0.362993 0.319444 0.037913 51.4
6 infermiere -0.047930 -0.094381 -0.072629 -0.036522 infermiera -0.258774 0.285713 0.239436 0.039524 77.0
7 biologo 0.034183 0.093461 0.075028 0.044190 biologa -0.212560 0.265112 0.271275 0.001866 71.4
8 filosofo 0.135867 0.140866 0.031013 -0.021475 filosofa -0.103877 0.181049 0.202317 0.027595 51.9
In [122]:
# Save the male/female projection table (mf_truth_table) as CSV, without the
# row index. This cell used to write truth_table, which is a different table
# from the one built for 'truth_gen_g.csv'.
mf_truth_table.to_csv('truth_gen_g.csv', index=False)
In [123]:
# Element-wise average of the male and the female projection of each pair,
# computed separately for every gender direction.
mf_mean_g_diff = np.add(mf_truth_col1, mf_truth_col6) / 2
mf_mean_g_pca_0 = np.add(mf_truth_col2, mf_truth_col7) / 2
mf_mean_g_pca_01 = np.add(mf_truth_col3, mf_truth_col8) / 2
mf_mean_g_e = np.add(mf_truth_col4, mf_truth_col9) / 2
In [124]:
# Pearson correlation coefficient between '% female' (mf_truth_col10) and the
# mean male/female projection, one value per gender direction. Only the
# coefficient (first element of the pearsonr result) is kept.
r_mean_g_diff, r_mean_g_pca_0, r_mean_g_pca_01, r_mean_g_e = (
    scipy.stats.pearsonr(mf_truth_col10, mean_proj)[0]
    for mean_proj in (mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e)
)
In [125]:
# One-row summary table (row label 'r') with the correlation obtained for
# each gender direction.
pearson_data = dict(
    g_diff=r_mean_g_diff,
    g_pca_0=r_mean_g_pca_0,
    g_pca_01=r_mean_g_pca_01,
    g_e=r_mean_g_e,
)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
Out[125]:
g_diff g_pca_0 g_pca_01 g_e
r 0.074269 0.032458 0.213333 0.227029
In [126]:
# g_diff: the 'ung' projections with their fitted regression line, together
# with the male and female projections, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

ung_x = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values).
for names, xs, ys in (
    (truth_col0, truth_col6, truth_col1),
    (mf_truth_col0, mf_truth_col10, mf_truth_col1),
    (mf_truth_col5, mf_truth_col10, mf_truth_col6),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_diff')
In [127]:
# g_diff: same plot as the plain g_diff scatter, plus the mean male/female
# projection and a second regression line fitted on that mean.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_diff)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_diff, c='brown', label='mean proj')
ax.set(title='g_diff', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values). The mean points are left unlabelled.
for names, xs, ys in (
    (truth_col0, truth_col6, truth_col1),
    (mf_truth_col0, mf_truth_col10, mf_truth_col1),
    (mf_truth_col5, mf_truth_col10, mf_truth_col6),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_diff_mean')
In [128]:
# g_pca_0: the 'ung' projections with their fitted regression line, together
# with the male and female projections, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

ung_x = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values).
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2),
    (mf_truth_col5, mf_truth_col10, mf_truth_col7),
    (truth_col0, truth_col6, truth_col2),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_pca_0')
In [129]:
# g_pca_0: same plot as the plain g_pca_0 scatter, plus the mean male/female
# projection and a second regression line fitted on that mean.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col2)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_pca_0)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col2, linewidth=0, marker='s', color='red', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col2, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col7, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_0, c='brown', label='mean proj')
ax.set(title='g_pca_0', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values). The mean points are left unlabelled.
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col2),
    (mf_truth_col5, mf_truth_col10, mf_truth_col7),
    (truth_col0, truth_col6, truth_col2),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_pca_0_mean')
In [130]:
# g_pca_01: the 'ung' projections with their fitted regression line, together
# with the male and female projections, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

ung_x = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values).
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),
    (truth_col0, truth_col6, truth_col3),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_pca_01')
In [131]:
# g_pca_01: same plot as the plain g_pca_01 scatter, plus the mean male/female
# projection and a second regression line fitted on that mean.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col3)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_pca_01)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col3, linewidth=0, marker='s', color='green', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col3, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col8, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_pca_01, c='brown', label='mean proj')
ax.set(title='g_pca_01', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values). The mean points are left unlabelled.
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col3),
    (mf_truth_col5, mf_truth_col10, mf_truth_col8),
    (truth_col0, truth_col6, truth_col3),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_pca_01_mean')
In [132]:
# g_e: the 'ung' projections with their fitted regression line, together
# with the male and female projections, all against '% female'.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

ung_x = np.array(truth_col6)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values).
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),
    (truth_col0, truth_col6, truth_col4),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_e')
In [133]:
# g_e: same plot as the plain g_e scatter, plus the mean male/female
# projection and a second regression line fitted on that mean.
slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col4)
line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
    mf_truth_col10, mf_mean_g_e)
line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

ung_x = np.array(truth_col6)
gen_x = np.array(mf_truth_col10)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(truth_col6, truth_col4, linewidth=0, marker='s', color='magenta', label='ung proj')
ax.plot(ung_x, intercept + slope * ung_x, label=line)
ax.plot(gen_x, intercept_mean + slope_mean * gen_x, label=line_mean, color='black')
ax.scatter(mf_truth_col10, mf_truth_col4, c='cyan', label='male proj')
ax.scatter(mf_truth_col10, mf_truth_col9, c='pink', label='female proj')
ax.scatter(mf_truth_col10, mf_mean_g_e, c='brown', label='mean proj')
ax.set(title='g_e', xlabel='% female', ylabel='proj value')
ax.legend()

# Label every marker 10 points above it, one series after the other:
# (labels, x values, y values). The mean points are left unlabelled.
for names, xs, ys in (
    (mf_truth_col0, mf_truth_col10, mf_truth_col4),
    (mf_truth_col5, mf_truth_col10, mf_truth_col9),
    (truth_col0, truth_col6, truth_col4),
):
    for i, label in enumerate(names):
        ax.annotate(label,
                    (xs[i], ys[i]),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha='center')

plt.show()
fig.savefig('plot/gen-prof/scatter/g_g_e_mean')
In [ ]: